/**
 * 澎湃 公共模块
 */

import Creeper, { _, extractFields } from '../../lib/creeper';

/**
 * Configures and starts a crawler for thepaper.cn.
 *
 * Every URL in `list` is handed to the crawler as a scan page. Article teasers
 * found in a scan page's HTML are queued as content pages
 * (`https://www.thepaper.cn/newsDetail_forward_<id>`), and each teaser's title
 * and image are saved in `_.store` under that article URL. When the `time`
 * field of an article cannot be parsed, or `_.expired(dt, '24h')` reports it as
 * expired, the page is skipped via `page.skip()`.
 *
 * @param list Scan (list) page URLs the crawler starts from.
 */
function thepaper(list: string[]): void {
  const detailUrlPrefix = 'https://www.thepaper.cn/newsDetail_forward_';

  const app = new Creeper({
    domains: ['www.thepaper.cn'],
    scanUrls: list,
    // Regex identifying content (article) page URLs.
    contentUrlRegexes: [/https:\/\/www\.thepaper\.cn\/newsDetail_forward_\d+/],
    fields: [
      {
        name: 'time',
        alias: '时间',
        selector: '//div[@class="news_about"]/p[2]',
        required: true,
      },
      {
        name: 'lead',
        alias: '简要',
        selector: '//meta[@name="Description"]/@content',
      },
      {
        name: 'content',
        alias: '内容',
        selector: '//div[@class="news_txt"]',
        required: true,
      },
    ],
  });

  app.onProcessScanPage((page, content, site) => {
    // Capture groups: (1) article id, (2) teaser image src, (3) teaser image alt (used as title).
    const teaserPattern = /data-id="(\d+)"[^>]+>\s*<img\s+src="([^"]+)"\s+alt="([^"]+)/g;
    // Ids already handled on this scan page, so a repeated teaser is queued only once.
    const seenIds = new Set<string>();

    // Walk the matches directly rather than calling String.replace purely for
    // its callback side effects (which also rebuilt and discarded the page).
    let match: RegExpExecArray | null;
    while ((match = teaserPattern.exec(content)) !== null) {
      const [, id, img, title] = match;
      if (seenIds.has(id)) continue;
      seenIds.add(id);

      const url = detailUrlPrefix + id;

      site.addUrl(url); // queue the article page for crawling
      _.store.set(url, {
        title,
        image: _.fixImg(img, false),
      });
    }

    console.log(`发现 ${seenIds.size} 个符合要求的文章，准备爬取...`);

    return false;
  });

  app.afterExtractField((fieldName, data, page, site, index) => {
    // Everything except `time` goes through the shared field post-processing.
    if (fieldName !== 'time') {
      return extractFields(fieldName, data, page, site, index);
    }

    // NOTE(review): presumably the extracted text has minute precision
    // (e.g. "YYYY-MM-DD HH:mm"), hence the appended seconds — confirm against the page markup.
    const dt = _.getTime(data + ':00');
    if (String(dt) === 'Invalid Date' || _.expired(dt, '24h')) {
      page.skip();
    }

    // Epoch time in whole seconds, as a string. Computed arithmetically instead
    // of keeping the first 10 characters of the millisecond value, which is only
    // correct for 13-digit timestamps. An unparseable date still yields 'NaN'.
    return String(Math.floor(+dt / 1000));
  });

  app.start();
}

export default thepaper;
